if (!require("plotly")) install.packages("plotly")
## Loading required package: plotly
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
if (!require("gridExtra")) install.packages("gridExtra")
## Loading required package: gridExtra
if (!require("purrr")) install.packages("purrr")
## Loading required package: purrr
if (!require("Hmisc")) install.packages("Hmisc")
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following objects are masked from 'package:base':
##
## format.pval, units
if (!require("naniar")) install.packages("naniar")
## Loading required package: naniar
if (!require("lubridate")) install.packages("lubridate")
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
if (!require("forcats")) install.packages("forcats")
## Loading required package: forcats
if (!require("skimr")) install.packages("skimr")
## Loading required package: skimr
##
## Attaching package: 'skimr'
## The following object is masked from 'package:naniar':
##
## n_complete
## The following object is masked from 'package:stats':
##
## filter
if (!require("stats")) install.packages("stats")
if (!require("scales")) install.packages("scales")
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
library(plotly)
library(gridExtra)
library(purrr)
library(Hmisc)
library(naniar)
library(lubridate)
library(forcats)
library(skimr)
library(stats)
library(scales)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ readr 1.3.1
## ✔ tidyr 0.8.3 ✔ dplyr 0.8.3
## ✔ tibble 2.1.3 ✔ stringr 1.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ lubridate::date() masks base::date()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks skimr::filter(), plotly::filter(), stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ✖ lubridate::union() masks base::union()
adoption <- read_csv("adoption_final.csv")
## Parsed with column specification:
## cols(
## Month = col_character(),
## Customer_ID = col_character(),
## Customer_size = col_character(),
## Geo_Code = col_character(),
## Visualize = col_character(),
## Alert = col_character(),
## Report = col_character()
## )
april <- read_csv("april_billing_final.csv")
## Parsed with column specification:
## cols(
## Billing_month = col_datetime(format = ""),
## Customer_ID = col_character(),
## Registration_date = col_date(format = ""),
## product_id = col_double(),
## Billed_amount = col_double()
## )
june <- read_csv("june_billing_final.csv")
## Parsed with column specification:
## cols(
## Billing_month = col_datetime(format = ""),
## Customer_ID = col_character(),
## Registration_date = col_date(format = ""),
## product_id = col_double(),
## Billed_amount = col_double()
## )
may <- read_csv("may_billing_final.csv")
## Parsed with column specification:
## cols(
## Billing_month = col_datetime(format = ""),
## Customer_ID = col_character(),
## Registration_date = col_date(format = ""),
## product_id = col_double(),
## Billed_amount = col_double()
## )
April: 45,604
May: 47,272
June: 47,149
Missing 16 GEO-code
140,025 observation –> remove 16 missing values in GEO-code
describe(adoption)
## adoption
##
## 7 Variables 140025 Observations
## ---------------------------------------------------------------------------
## Month
## n missing distinct
## 140025 0 3
##
## Value April-19 June-19 May-19
## Frequency 45604 47149 47272
## Proportion 0.326 0.337 0.338
## ---------------------------------------------------------------------------
## Customer_ID
## n missing distinct
## 140025 0 52829
##
## lowest : 004676561936772 00a790034978456 00b681334781982 00D351757638125 00F721755330737
## highest: zzX629055216030 ZzX879532807956 zZy662437699323 zzy718159275481 Zzy795772531160
## ---------------------------------------------------------------------------
## Customer_size
## n missing distinct
## 140025 0 3
##
## Value Large Mid Small
## Frequency 392 1714 137919
## Proportion 0.003 0.012 0.985
## ---------------------------------------------------------------------------
## Geo_Code
## n missing distinct
## 140009 16 7
##
## Value AMER APAC CHNA EMEA
## Frequency 71874 17766 6388 30877
## Proportion 0.513 0.127 0.046 0.221
##
## Value GEO-UNCLAIMED GLBL JAPN
## Frequency 47 634 12423
## Proportion 0.000 0.005 0.089
## ---------------------------------------------------------------------------
## Visualize
## n missing distinct
## 140025 0 2
##
## Value No Yes
## Frequency 104168 35857
## Proportion 0.744 0.256
## ---------------------------------------------------------------------------
## Alert
## n missing distinct
## 140025 0 2
##
## Value No Yes
## Frequency 131290 8735
## Proportion 0.938 0.062
## ---------------------------------------------------------------------------
## Report
## n missing distinct
## 140025 0 2
##
## Value No Yes
## Frequency 131927 8098
## Proportion 0.942 0.058
## ---------------------------------------------------------------------------
#Show rows wiht missing value
adopt_missing <- adoption[is.na(adoption$Geo_Code),]
adopt_missing
## # A tibble: 16 x 7
## Month Customer_ID Customer_size Geo_Code Visualize Alert Report
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 April-19 3ve622950616356 Small <NA> No No No
## 2 May-19 3ve622950616356 Small <NA> No No No
## 3 June-19 3ve622950616356 Small <NA> No No No
## 4 April-19 gMS335204256682 Small <NA> No No No
## 5 May-19 gMS335204256682 Mid <NA> No No No
## 6 June-19 gMS335204256682 Mid <NA> No No No
## 7 April-19 Dyu886342809534 Small <NA> No No No
## 8 May-19 Dyu886342809534 Small <NA> No No No
## 9 June-19 Dyu886342809534 Small <NA> No No No
## 10 June-19 4Yq076930086263 Small <NA> No No No
## 11 June-19 sNr024265486996 Small <NA> No No No
## 12 June-19 uLn203194071015 Small <NA> No No No
## 13 June-19 f2P113951421659 Small <NA> No No No
## 14 June-19 leE241742919990 Small <NA> No No No
## 15 June-19 EjH888820866536 Small <NA> No No No
## 16 June-19 JWE850603294351 Small <NA> No No No
unique(adopt_missing$Customer_ID)
## [1] "3ve622950616356" "gMS335204256682" "Dyu886342809534"
## [4] "4Yq076930086263" "sNr024265486996" "uLn203194071015"
## [7] "f2P113951421659" "leE241742919990" "EjH888820866536"
## [10] "JWE850603294351"
length(unique(adopt_missing$Customer_ID)) # 10 id with missing geo_code
## [1] 10
#Filter out these 16 rows
adoption <- adoption %>% filter(!is.na(Geo_Code ))
adoption %>%
group_by(Customer_size) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size, pct), y = pct)) + geom_bar(stat='identity') +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label=paste(count), y=pct+0.025), size=4)
adoption %>%
group_by(Geo_Code) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Geo_Code, pct), y = pct)) + geom_bar(stat='identity') +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label=paste(count), y=pct+0.025), size=4)
##+ Service
grid.arrange(
adoption %>%
group_by(Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Visualize, pct), y = pct)) + geom_bar(stat='identity') +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label=paste(count), y=pct+0.025), size=4),
adoption %>%
group_by(Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Alert, pct), y = pct)) + geom_bar(stat='identity') +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label=paste(count), y=pct+0.025), size=4),
adoption %>%
group_by(Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Report, pct), y = pct)) + geom_bar(stat='identity') +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label=paste(count), y=pct+0.025), size=4),ncol = 3)
grid.arrange(
adoption %>%
group_by(Customer_size, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
adoption %>%
group_by(Customer_size,Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent) ,
adoption %>%
group_by(Customer_size,Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),ncol = 1)
grid.arrange(
adoption %>%
group_by(Geo_Code, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
adoption %>%
group_by(Geo_Code,Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent) ,
adoption %>%
group_by(Geo_Code,Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),ncol = 1)
# Create adoption-category data
VAR <- adoption %>%
filter(grepl('Yes', Visualize) & grepl('Yes', Alert) & grepl('Yes', Report)) %>%
mutate(Tools = "VAR")
VA <- adoption %>%
filter(grepl('Yes', Visualize) & grepl('Yes', Alert) & grepl('No', Report)) %>%
mutate(Tools = "VA")
VR <- adoption %>%
filter(grepl('Yes', Visualize) & grepl('No', Alert) & grepl('Yes', Report)) %>%
mutate(Tools = "VR")
AR <- adoption %>%
filter(grepl('No', Visualize) & grepl('Yes', Alert) & grepl('Yes', Report)) %>%
mutate(Tools = "AR")
V <- adoption %>%
filter(grepl('Yes', Visualize) & grepl('No', Alert) & grepl('No', Report)) %>%
mutate(Tools = "V")
A <- adoption %>%
filter(grepl('No', Visualize) & grepl('Yes', Alert) & grepl('No', Report)) %>%
mutate(Tools = "A")
R <- adoption %>%
filter(grepl('No', Visualize) & grepl('No', Alert) & grepl('Yes', Report)) %>%
mutate(Tools = "R")
N <- adoption %>%
filter(grepl('No', Visualize) & grepl('No', Alert) & grepl('No', Report)) %>%
mutate(Tools = "N")
adoption<- bind_rows(VAR, VA, AR, VR,
V, A, R, N) %>% mutate(Tools = as.factor(Tools))
describe(april)
## april
##
## 5 Variables 528806 Observations
## ---------------------------------------------------------------------------
## Billing_month
## n missing distinct Info Mean Gmd
## 528806 0 1 0 2019-04-01 1970-01-01
##
## Value 1554076800
## Frequency 528806
## Proportion 1
## ---------------------------------------------------------------------------
## Customer_ID
## n missing distinct
## 528806 0 45876
##
## lowest : 004676561936772 00a790034978456 00b681334781982 00D351757638125 00F721755330737
## highest: ZZx567245693938 zzX629055216030 zZy662437699323 zzy718159275481 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date
## n missing distinct
## 528806 0 4279
##
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-18 2004-10-04
## highest: 2019-04-22 2019-04-23 2019-04-24 2019-04-25 2019-04-26
## ---------------------------------------------------------------------------
## product_id
## n missing distinct Info Mean Gmd .05 .10
## 506854 21952 2177 0.997 7735 9009 8 10
## .25 .50 .75 .90 .95
## 938 3299 11921 17724 25460
##
## lowest : 2 7 8 10 12, highest: 104054 104099 104149 104166 104398
## ---------------------------------------------------------------------------
## Billed_amount
## n missing distinct Info Mean Gmd .05 .10
## 528806 0 294652 0.973 404.7 733 0.000 0.000
## .25 .50 .75 .90 .95
## 0.000 2.137 108.000 911.703 1798.050
##
## lowest : -16007.837 -10051.695 -4266.879 -4094.760 -3978.232
## highest: 297159.973 367635.247 412586.133 414781.007 854206.203
## ---------------------------------------------------------------------------
april %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins = 100)
## Warning: Removed 21952 rows containing non-finite values (stat_bin).
#Show missing values
april[is.na(april$product_id),]
## # A tibble: 21,952 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-04-01 00:00:00 0046765619… 2015-06-18 NA
## 2 2019-04-01 00:00:00 00F7217553… 2012-01-16 NA
## 3 2019-04-01 00:00:00 00R0968053… 2012-01-30 NA
## 4 2019-04-01 00:00:00 00S8151738… 2016-02-04 NA
## 5 2019-04-01 00:00:00 00X7404388… 2017-06-01 NA
## 6 2019-04-01 00:00:00 0180506686… 2018-10-15 NA
## 7 2019-04-01 00:00:00 0185262367… 2007-10-20 NA
## 8 2019-04-01 00:00:00 01J1125053… 2015-11-14 NA
## 9 2019-04-01 00:00:00 01O6937090… 2015-05-28 NA
## 10 2019-04-01 00:00:00 01R0858474… 2017-11-29 NA
## # … with 21,942 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#april$product_id[is.na(april$product_id)] <- 0
range(april$Registration_date)
## [1] "2004-08-21" "2019-04-26"
april %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)
summary(april$Billed_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -16007.8 0.0 2.1 404.7 108.0 854206.2
april %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()
april[april$Billed_amount <0,] #133 negative billed amount
## # A tibble: 133 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-04-01 00:00:00 07d8117843… 2016-06-16 1925
## 2 2019-04-01 00:00:00 0gL3474882… 2017-12-02 1925
## 3 2019-04-01 00:00:00 0hT1568119… 2018-08-10 1925
## 4 2019-04-01 00:00:00 1R30645204… 2017-08-02 1925
## 5 2019-04-01 00:00:00 1r38382634… 2017-11-06 1925
## 6 2019-04-01 00:00:00 2WN0679333… 2019-01-08 1925
## 7 2019-04-01 00:00:00 2bX4449667… 2015-07-28 1925
## 8 2019-04-01 00:00:00 2bx8772712… 2015-12-14 1925
## 9 2019-04-01 00:00:00 2iL2964594… 2018-01-18 1925
## 10 2019-04-01 00:00:00 2jj1545435… 2014-02-09 1925
## # … with 123 more rows, and 1 more variable: Billed_amount <dbl>
april[april$Billed_amount == 0,] # 157k have 0 billed amount
## # A tibble: 157,667 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-04-01 00:00:00 0046765619… 2015-06-18 1958
## 2 2019-04-01 00:00:00 0046765619… 2015-06-18 11813
## 3 2019-04-01 00:00:00 0046765619… 2015-06-18 NA
## 4 2019-04-01 00:00:00 00D3517576… 2009-06-16 91
## 5 2019-04-01 00:00:00 00D3517576… 2009-06-16 10298
## 6 2019-04-01 00:00:00 00D3517576… 2009-06-16 13350
## 7 2019-04-01 00:00:00 00F7217553… 2012-01-16 11813
## 8 2019-04-01 00:00:00 00F7217553… 2012-01-16 20597
## 9 2019-04-01 00:00:00 00F7217553… 2012-01-16 NA
## 10 2019-04-01 00:00:00 00R0968053… 2012-01-30 10298
## # … with 157,657 more rows, and 1 more variable: Billed_amount <dbl>
april[april$Billed_amount > 1000,]
## # A tibble: 49,001 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-04-01 00:00:00 0046765619… 2015-06-18 10
## 2 2019-04-01 00:00:00 00F7217553… 2012-01-16 10
## 3 2019-04-01 00:00:00 00F7217553… 2012-01-16 938
## 4 2019-04-01 00:00:00 00R0968053… 2012-01-30 10
## 5 2019-04-01 00:00:00 00R0968053… 2012-01-30 938
## 6 2019-04-01 00:00:00 00S6153818… 2018-09-11 10
## 7 2019-04-01 00:00:00 00S8151738… 2016-02-04 10
## 8 2019-04-01 00:00:00 00T0630891… 2018-02-14 10
## 9 2019-04-01 00:00:00 00T4181469… 2016-02-12 10
## 10 2019-04-01 00:00:00 00X7404388… 2017-06-01 8
## # … with 48,991 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill <- april[april$Billed_amount>0 & april$Billed_amount<1500,]
test_bill %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
describe(may)
## may
##
## 5 Variables 552263 Observations
## ---------------------------------------------------------------------------
## Billing_month
## n missing distinct Info Mean Gmd
## 552263 0 1 0 2019-05-01 1970-01-01
##
## Value 1556668800
## Frequency 552263
## Proportion 1
## ---------------------------------------------------------------------------
## Customer_ID
## n missing distinct
## 552263 0 47516
##
## lowest : 004676561936772 00a790034978456 00b681334781982 00F721755330737 00n198760607655
## highest: zZX252173851580 ZZx567245693938 ZzX879532807956 zZy662437699323 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date
## n missing distinct
## 552263 0 4311
##
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-11 2004-09-18
## highest: 2019-05-22 2019-05-23 2019-05-24 2019-05-27 2019-05-29
## ---------------------------------------------------------------------------
## product_id
## n missing distinct Info Mean Gmd .05 .10
## 529250 23013 2262 0.997 7921 9244 8 10
## .25 .50 .75 .90 .95
## 938 3299 11921 18651 25460
##
## lowest : 2 7 8 10 12, highest: 111221 111223 111227 111433 111434
## ---------------------------------------------------------------------------
## Billed_amount
## n missing distinct Info Mean Gmd .05 .10
## 552263 0 307740 0.973 408.4 740.1 0.000 0.000
## .25 .50 .75 .90 .95
## 0.000 2.088 108.624 915.271 1811.600
##
## lowest : -17555.206 -7631.000 -6906.177 -5038.969 -4901.200
## highest: 287151.208 417530.563 424886.635 449439.475 900500.004
## ---------------------------------------------------------------------------
may %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins = 100)
## Warning: Removed 23013 rows containing non-finite values (stat_bin).
#Show missing values
may[is.na(may$product_id),]
## # A tibble: 23,013 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-05-01 00:00:00 0046765619… 2015-06-18 NA
## 2 2019-05-01 00:00:00 00F7217553… 2012-01-16 NA
## 3 2019-05-01 00:00:00 00R0968053… 2012-01-30 NA
## 4 2019-05-01 00:00:00 00S8151738… 2016-02-04 NA
## 5 2019-05-01 00:00:00 00X7404388… 2017-06-01 NA
## 6 2019-05-01 00:00:00 0180506686… 2018-10-15 NA
## 7 2019-05-01 00:00:00 0185262367… 2007-10-20 NA
## 8 2019-05-01 00:00:00 01J1125053… 2015-11-14 NA
## 9 2019-05-01 00:00:00 01O6937090… 2015-05-28 NA
## 10 2019-05-01 00:00:00 01R0858474… 2017-11-29 NA
## # … with 23,003 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#may$product_id[is.na(may$product_id)] <- 0
range(may$Registration_date)
## [1] "2004-08-21" "2019-05-29"
may %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)
summary(may$Billed_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17555.2 0.0 2.1 408.4 108.6 900500.0
may %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()
may[may$Billed_amount <0,] #124 negative billed amount
## # A tibble: 124 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-05-01 00:00:00 0gL3474882… 2017-12-02 1925
## 2 2019-05-01 00:00:00 0hT1568119… 2018-08-10 1925
## 3 2019-05-01 00:00:00 1CD7032740… 2014-11-18 1925
## 4 2019-05-01 00:00:00 1ff8916555… 2019-02-19 1925
## 5 2019-05-01 00:00:00 1y80992059… 2017-02-20 1925
## 6 2019-05-01 00:00:00 2bX4449667… 2015-07-28 1925
## 7 2019-05-01 00:00:00 2iL2964594… 2018-01-18 1925
## 8 2019-05-01 00:00:00 2m02498354… 2011-08-23 1925
## 9 2019-05-01 00:00:00 31O5385888… 2014-05-27 1925
## 10 2019-05-01 00:00:00 3Tn6704020… 2012-12-19 1925
## # … with 114 more rows, and 1 more variable: Billed_amount <dbl>
may[may$Billed_amount == 0,] # 165k have 0 billed amount
## # A tibble: 165,875 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-05-01 00:00:00 0046765619… 2015-06-18 1958
## 2 2019-05-01 00:00:00 0046765619… 2015-06-18 2155
## 3 2019-05-01 00:00:00 0046765619… 2015-06-18 11813
## 4 2019-05-01 00:00:00 0046765619… 2015-06-18 26720
## 5 2019-05-01 00:00:00 0046765619… 2015-06-18 NA
## 6 2019-05-01 00:00:00 00F7217553… 2012-01-16 11813
## 7 2019-05-01 00:00:00 00F7217553… 2012-01-16 20597
## 8 2019-05-01 00:00:00 00F7217553… 2012-01-16 NA
## 9 2019-05-01 00:00:00 00R0968053… 2012-01-30 10298
## 10 2019-05-01 00:00:00 00R0968053… 2012-01-30 NA
## # … with 165,865 more rows, and 1 more variable: Billed_amount <dbl>
may[may$Billed_amount > 1000,] #51k have bill > 1000
## # A tibble: 51,373 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-05-01 00:00:00 0046765619… 2015-06-18 10
## 2 2019-05-01 00:00:00 00F7217553… 2012-01-16 10
## 3 2019-05-01 00:00:00 00F7217553… 2012-01-16 938
## 4 2019-05-01 00:00:00 00R0968053… 2012-01-30 10
## 5 2019-05-01 00:00:00 00R0968053… 2012-01-30 938
## 6 2019-05-01 00:00:00 00S6153818… 2018-09-11 10
## 7 2019-05-01 00:00:00 00S8151738… 2016-02-04 10
## 8 2019-05-01 00:00:00 00T0630891… 2018-02-14 10
## 9 2019-05-01 00:00:00 00T4181469… 2016-02-12 10
## 10 2019-05-01 00:00:00 00X7404388… 2017-06-01 8
## # … with 51,363 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill_may <- may[may$Billed_amount>0 & may$Billed_amount<1500,]
test_bill_may %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
describe(june)
## june
##
## 5 Variables 556261 Observations
## ---------------------------------------------------------------------------
## Billing_month
## n missing distinct Info Mean Gmd
## 556261 0 1 0 2019-06-01 1970-01-01
##
## Value 1559347200
## Frequency 556261
## Proportion 1
## ---------------------------------------------------------------------------
## Customer_ID
## n missing distinct
## 556261 0 47442
##
## lowest : 004676561936772 00a790034978456 00b681334781982 00F721755330737 00n198760607655
## highest: ZZw803293842918 zZX252173851580 ZZx567245693938 zZy662437699323 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date
## n missing distinct
## 556261 0 4332
##
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-11 2004-09-18
## highest: 2019-06-23 2019-06-24 2019-06-27 2019-06-29 2019-06-30
## ---------------------------------------------------------------------------
## product_id
## n missing distinct Info Mean Gmd .05 .10
## 533283 22978 2300 0.997 8065 9421 8 10
## .25 .50 .75 .90 .95
## 938 3299 11921 18651 25460
##
## lowest : 2 7 8 10 12, highest: 116105 116111 116114 116531 116533
## ---------------------------------------------------------------------------
## Billed_amount
## n missing distinct Info Mean Gmd .05 .10
## 556261 0 310357 0.974 407 737.9 0.00 0.00
## .25 .50 .75 .90 .95
## 0.00 2.17 108.00 902.72 1794.85
##
## lowest : -17894.279 -6223.696 -4109.864 -3840.000 -3741.370
## highest: 376950.934 385215.775 417226.326 445296.786 919030.922
## ---------------------------------------------------------------------------
june %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins = 100)
## Warning: Removed 22978 rows containing non-finite values (stat_bin).
#Show missing values
june[is.na(june$product_id),]
## # A tibble: 22,978 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-06-01 00:00:00 0046765619… 2015-06-18 NA
## 2 2019-06-01 00:00:00 00F7217553… 2012-01-16 NA
## 3 2019-06-01 00:00:00 00R0968053… 2012-01-30 NA
## 4 2019-06-01 00:00:00 00S8151738… 2016-02-04 NA
## 5 2019-06-01 00:00:00 00X7404388… 2017-06-01 NA
## 6 2019-06-01 00:00:00 0180506686… 2018-10-15 NA
## 7 2019-06-01 00:00:00 0185262367… 2007-10-20 NA
## 8 2019-06-01 00:00:00 01J1125053… 2015-11-14 NA
## 9 2019-06-01 00:00:00 01O6937090… 2015-05-28 NA
## 10 2019-06-01 00:00:00 01R0858474… 2017-11-29 NA
## # … with 22,968 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#june$product_id[is.na(june$product_id)] <- 0
range(june$Registration_date)
## [1] "2004-08-21" "2019-06-30"
june %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)
summary(june$Billed_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17894.3 0.0 2.2 407.0 108.0 919030.9
june %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()
june[june$Billed_amount <0,] #117 negative billed amount
## # A tibble: 117 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-06-01 00:00:00 0gL3474882… 2017-12-02 1925
## 2 2019-06-01 00:00:00 0hT1568119… 2018-08-10 1925
## 3 2019-06-01 00:00:00 16P9061271… 2015-11-30 1925
## 4 2019-06-01 00:00:00 1HA8154862… 2017-06-16 1925
## 5 2019-06-01 00:00:00 1Xb0305152… 2018-09-24 1925
## 6 2019-06-01 00:00:00 1y80992059… 2017-02-20 1925
## 7 2019-06-01 00:00:00 1zn1806137… 2008-10-10 1925
## 8 2019-06-01 00:00:00 2bX4449667… 2015-07-28 1925
## 9 2019-06-01 00:00:00 2iL2964594… 2018-01-18 1925
## 10 2019-06-01 00:00:00 2m02498354… 2011-08-23 1925
## # … with 107 more rows, and 1 more variable: Billed_amount <dbl>
june[june$Billed_amount == 0,] # 164k have 0 billed amount
## # A tibble: 164,587 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-06-01 00:00:00 0046765619… 2015-06-18 1958
## 2 2019-06-01 00:00:00 0046765619… 2015-06-18 2155
## 3 2019-06-01 00:00:00 0046765619… 2015-06-18 11813
## 4 2019-06-01 00:00:00 0046765619… 2015-06-18 26720
## 5 2019-06-01 00:00:00 0046765619… 2015-06-18 NA
## 6 2019-06-01 00:00:00 00F7217553… 2012-01-16 11813
## 7 2019-06-01 00:00:00 00F7217553… 2012-01-16 20597
## 8 2019-06-01 00:00:00 00F7217553… 2012-01-16 NA
## 9 2019-06-01 00:00:00 00R0968053… 2012-01-30 NA
## 10 2019-06-01 00:00:00 00S6153818… 2018-09-11 11813
## # … with 164,577 more rows, and 1 more variable: Billed_amount <dbl>
june[june$Billed_amount > 1000,] #51k have bill > 1000
## # A tibble: 51,264 x 5
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <dbl>
## 1 2019-06-01 00:00:00 0046765619… 2015-06-18 10
## 2 2019-06-01 00:00:00 00F7217553… 2012-01-16 10
## 3 2019-06-01 00:00:00 00F7217553… 2012-01-16 938
## 4 2019-06-01 00:00:00 00R0968053… 2012-01-30 10
## 5 2019-06-01 00:00:00 00R0968053… 2012-01-30 938
## 6 2019-06-01 00:00:00 00S6153818… 2018-09-11 10
## 7 2019-06-01 00:00:00 00S8151738… 2016-02-04 10
## 8 2019-06-01 00:00:00 00T0630891… 2018-02-14 10
## 9 2019-06-01 00:00:00 00T4181469… 2016-02-12 10
## 10 2019-06-01 00:00:00 00X7404388… 2017-06-01 8
## # … with 51,254 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill_june <- june[june$Billed_amount>0 & june$Billed_amount<1500,]
test_bill_june %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Make adoption data set based on month
adoption_april <- adoption[adoption$Month == 'April-19',]
adoption_may <- adoption[adoption$Month == 'May-19',]
adoption_june <- adoption[adoption$Month == 'June-19',]
#Inner join because some ID are unmatched
full_april <- april %>% inner_join(adoption_april, by = 'Customer_ID')%>%
select(-Month) %>%
mutate( age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration
full_may <- may %>% inner_join(adoption_may, by = 'Customer_ID')%>%
select(-Month) %>%
mutate(age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration
#Training and testing data
train_data <- rbind(full_april, full_may)
test_data <- june %>% inner_join(adoption_june, by = 'Customer_ID') %>%
select(-Month)%>%
mutate(age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration
#Full data April May June
full_data <- rbind(train_data, test_data)
#Check NA
colSums(is.na(full_data))
## Billing_month Customer_ID Registration_date product_id
## 0 0 0 67889
## Billed_amount Customer_size Geo_Code Visualize
## 0 0 0 0
## Alert Report Tools age
## 0 0 0 0
#There is no rows that have missing product_id and non-zero billed_amount
full_data %>% filter(is.na(product_id) & Billed_amount != 0)
## # A tibble: 0 x 12
## # … with 12 variables: Billing_month <dttm>, Customer_ID <chr>,
## # Registration_date <date>, product_id <dbl>, Billed_amount <dbl>,
## # Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## # Report <chr>, Tools <fct>, age <dbl>
#Filter out missing product_id rows
full_data<-full_data[!is.na(full_data$product_id),]
full_data <- full_data %>% filter(Registration_date !='2014-01-17' & Billed_amount <= 90000)
#Outlier
# productID with negative amount
full_data %>% filter(full_data$Billed_amount < 0) %>% group_by(product_id) %>% summarise(count =n()) #373 negative bills
## # A tibble: 3 x 2
## product_id count
## <dbl> <int>
## 1 1925 368
## 2 3991 1
## 3 90691 4
#Delete negative bill amount
full_data$Billed_amount[full_data$Billed_amount <0] <- 0
#Cap values above 95 percentile of each product id
full_data <- full_data %>%
group_by(product_id) %>%
mutate(Capped_billed_amount = squish(Billed_amount, quantile(Billed_amount, c(0.05, 0.95)))) %>%
ungroup()
#Visualize new billed amount distribution
ggplot(full_data, aes(y = Billed_amount))+geom_boxplot()
hist(full_data$Billed_amount)
hist(full_data$Capped_billed_amount)
summary(full_data$Billed_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 3.15 414.15 125.39 89476.19
summary(full_data$Capped_billed_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 3.28 326.23 116.00 80802.92
#Regroup product id
#find top 5 product
count_top_prod<-full_data %>% group_by(product_id) %>% summarize(count = n())
count_top_prod[order(-count_top_prod$count),]
## # A tibble: 2,554 x 2
## product_id count
## <dbl> <int>
## 1 10 136084
## 2 8 128750
## 3 10298 121700
## 4 11813 104970
## 5 1958 92445
## 6 938 90604
## 7 2310 83195
## 8 3299 63549
## 9 548 51341
## 10 11921 49448
## # … with 2,544 more rows
#Most used product is 10, 8, 10298, 11813, 1958
#use product_id = 0 as other
full_data <- full_data %>% mutate(product_id = case_when(product_id == 10 ~ 10,
product_id == 8 ~ 8,
product_id == 10298 ~ 10298,
product_id == 11813 ~ 11813,
product_id == 1958 ~ 1958,
TRUE ~ 0))
full_data$product_id <- as.factor(full_data$product_id)
range(full_data$age)
## [1] -0.07671233 14.78630137
full_data[full_data$age<0,]
## # A tibble: 1,881 x 13
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <fct>
## 1 2019-04-01 00:00:00 0fY8840097… 2019-04-19 10
## 2 2019-04-01 00:00:00 0fY8840097… 2019-04-19 10298
## 3 2019-04-01 00:00:00 0fY8840097… 2019-04-19 11813
## 4 2019-04-01 00:00:00 0fY8840097… 2019-04-19 0
## 5 2019-04-01 00:00:00 1An1379640… 2019-04-20 8
## 6 2019-04-01 00:00:00 1An1379640… 2019-04-20 10
## 7 2019-04-01 00:00:00 1An1379640… 2019-04-20 0
## 8 2019-04-01 00:00:00 1An1379640… 2019-04-20 0
## 9 2019-04-01 00:00:00 1An1379640… 2019-04-20 11813
## 10 2019-04-01 00:00:00 1An1379640… 2019-04-20 0
## # … with 1,871 more rows, and 9 more variables: Billed_amount <dbl>,
## # Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## # Report <chr>, Tools <fct>, age <dbl>, Capped_billed_amount <dbl>
full_data <- full_data %>% mutate(age_group = case_when(age < 15 & age > 9 ~ 'Old customer (9-15)',
age <= 9 & age > 4 ~ 'Middle-aged customer (4-9)',
age <= 4 ~ 'Young customer (0-4)'))
#Number of product each month is number of rows grouped by month and ID
full_data <-full_data %>% group_by(Billing_month, Customer_ID) %>%
mutate(product_quant = n())
max(full_data$product_quant)
## [1] 149
full_data[(full_data$product_quant > 100),]
## # A tibble: 437 x 15
## # Groups: Billing_month, Customer_ID [3]
## Billing_month Customer_ID Registration_da… product_id
## <dttm> <chr> <date> <fct>
## 1 2019-04-01 00:00:00 s640820878… 2015-11-25 8
## 2 2019-04-01 00:00:00 s640820878… 2015-11-25 10
## 3 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 4 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 5 2019-04-01 00:00:00 s640820878… 2015-11-25 1958
## 6 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 7 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 8 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 9 2019-04-01 00:00:00 s640820878… 2015-11-25 0
## 10 2019-04-01 00:00:00 s640820878… 2015-11-25 10298
## # … with 427 more rows, and 11 more variables: Billed_amount <dbl>,
## # Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## # Report <chr>, Tools <fct>, age <dbl>, Capped_billed_amount <dbl>,
## # age_group <chr>, product_quant <int>
#Visualize distribution
hist(full_data$product_quant)
#Count by group
prop.table(table(full_data$Customer_size))*100
##
## Large Mid Small
## 0.4903786 2.1048342 97.4047872
prop.table(table(full_data$product_id))*100
##
## 0 8 10 1958 10298 11813
## 62.704338 8.223007 8.691415 5.904278 7.772737 6.704225
prop.table(table(full_data$Geo_Code))*100
##
## AMER APAC CHNA EMEA GEO-UNCLAIMED
## 52.667607230 11.983874604 3.121293659 22.877713832 0.008111238
## GLBL JAPN
## 0.377779296 8.963620141
prop.table(table(full_data$age_group))*100
##
## Middle-aged customer (4-9) Old customer (9-15)
## 43.254101 5.830447
## Young customer (0-4)
## 50.915452
#Age distribution
ggplot(full_data, aes(x = age, fill = age_group))+ geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Age, size and spending
full_data %>%
group_by(age_group, Customer_size) %>%
summarize(avg_bill_amount = mean(Billed_amount)) %>%
ggplot(aes(age_group, fill = Customer_size, y = avg_bill_amount))+geom_bar(stat = 'identity',position ='dodge')
#Visualize number of use per product
ggplot(full_data, aes(x = product_id)) + geom_bar()
#Visualize product id related to service use
grid.arrange(ggplot(full_data, aes(x = product_id, fill = Visualize)) + geom_bar(position = 'dodge'),
ggplot(full_data, aes(x = product_id, fill = Alert)) + geom_bar(position = 'dodge'),
ggplot(full_data, aes(x = product_id, fill = Report)) + geom_bar(position = 'dodge'), ncol = 1)
#Visualize product id related to service use (Percentage per product)
grid.arrange(
full_data %>%
group_by(product_id, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
full_data %>%
group_by(product_id, Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
full_data %>%
group_by(product_id, Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),ncol = 1)
# ggplot(full_data, aes(product_id, Customer_size))+geom_tile()
# Contingency table with customer size
prop.table(table(full_data$product_id, full_data$Customer_size), margin=2)*100
##
## Large Mid Small
## 0 76.413128 75.169924 62.365951
## 8 4.792915 5.037019 8.309122
## 10 4.792915 5.097706 8.788698
## 1958 4.597551 4.769996 5.935368
## 10298 4.766866 5.024882 7.847249
## 11813 4.636624 4.900473 6.753612
# Understand based on size
grid.arrange(ggplot(full_data, aes(x = Customer_size, fill = Visualize)) + geom_bar(position = 'dodge'),
ggplot(full_data, aes(x = Customer_size, fill = Alert)) + geom_bar(position = 'dodge'),
ggplot(full_data, aes(x = Customer_size, fill = Report)) + geom_bar(position = 'dodge'), ncol = 1)
grid.arrange(
full_data %>%
group_by(Customer_size, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
full_data %>%
group_by(Customer_size, Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),
full_data %>%
group_by(Customer_size, Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') +
scale_y_continuous(labels = scales::percent),ncol = 1)
full_data %>%
group_by(Geo_Code) %>%
summarize(age = round(mean(age),1),
Billed_amount = mean(Billed_amount),
num_product = mean(product_quant),
num_unique_customer = length(unique(Customer_ID)))
## # A tibble: 7 x 5
## Geo_Code age Billed_amount num_product num_unique_customer
## <chr> <dbl> <dbl> <dbl> <int>
## 1 AMER 4.7 440. 14.9 26623
## 2 APAC 3.7 376. 13.6 6982
## 3 CHNA 2.6 650. 10.5 2568
## 4 EMEA 4.3 357. 14.8 11728
## 5 GEO-UNCLAIMED 1.1 3424. 5.71 24
## 6 GLBL 3.6 676. 12.5 244
## 7 JAPN 3.5 365. 14.1 4575
# Revenue by customers_age - young and medium group occupy more than 90%
pie_custage <- full_data %>% group_by(age_group) %>%
summarize(Total_exp = sum(Capped_billed_amount)) %>%
mutate(perc = Total_exp / sum(Total_exp))
p <- plot_ly(pie_custage, labels = ~age_group, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'revenue by customer_age group across 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# Revenue by customers_size - more than 90% of business is driven by small customers
pie_custsize <- full_data %>% group_by(Customer_size) %>%
summarize(Total_exp = sum(Capped_billed_amount)) %>%
mutate(perc = Total_exp / sum(Total_exp))
p <- plot_ly(pie_custsize, labels = ~Customer_size, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'revenue by customer_size across 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# Revenue by Geo_Code - 75% of revenue are from AMER & EMER
pie_geo <- full_data %>% group_by(Geo_Code) %>%
summarize(Total_exp = sum(Capped_billed_amount)) %>%
mutate(perc = Total_exp / sum(Total_exp))
p <- plot_ly(pie_geo, labels = ~Geo_Code, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'Revenue by Geo_Code accross 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# top10 customers across 3 months
top10 <- full_data %>% group_by(Customer_ID,Billing_month) %>% mutate(Monthly_exp = sum(Capped_billed_amount))%>% ungroup() %>% select(-c(product_id,Billed_amount,Capped_billed_amount)) %>% distinct() %>% arrange(-Monthly_exp) %>% group_by(Billing_month) %>% mutate(rank = row_number()) %>% filter(rank<=10) %>% arrange(Billing_month,-Monthly_exp)
top10
## # A tibble: 30 x 14
## # Groups: Billing_month [3]
## Billing_month Customer_ID Registration_da… Customer_size Geo_Code
## <dttm> <chr> <date> <chr> <chr>
## 1 2019-04-01 00:00:00 j590734619… 2017-02-24 Large AMER
## 2 2019-04-01 00:00:00 GG50789640… 2016-05-13 Large AMER
## 3 2019-04-01 00:00:00 Wha3477019… 2014-04-15 Mid AMER
## 4 2019-04-01 00:00:00 B024432886… 2019-03-07 Mid APAC
## 5 2019-04-01 00:00:00 CcA0911110… 2010-09-23 Mid AMER
## 6 2019-04-01 00:00:00 ivv5206204… 2018-01-15 Mid APAC
## 7 2019-04-01 00:00:00 EZT7940349… 2018-04-03 Mid GEO-UNC…
## 8 2019-04-01 00:00:00 XRM4187246… 2018-08-23 Small APAC
## 9 2019-04-01 00:00:00 Pga0074070… 2016-11-28 Mid AMER
## 10 2019-04-01 00:00:00 Moa5298658… 2018-03-23 Large AMER
## # … with 20 more rows, and 9 more variables: Visualize <chr>, Alert <chr>,
## # Report <chr>, Tools <fct>, age <dbl>, age_group <chr>,
## # product_quant <int>, Monthly_exp <dbl>, rank <int>
# visualize the customer_age group for the customers
pie_top10 <- top10 %>% group_by(age_group) %>%
summarize(count = n()) %>%
mutate(perc = count / sum(count))
p <- plot_ly(pie_top10, labels = ~age_group, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'customer_age group of top10 revenue customer across 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
p <- full_data %>% group_by(Customer_ID,Billing_month) %>% mutate(Monthly_exp = sum(Capped_billed_amount))%>% ungroup() %>%
select(age,Monthly_exp,Customer_size) %>% distinct() %>% mutate (Customer_age= as.factor(age))
ggplot(p, aes(Customer_age,Monthly_exp, color=Customer_size)) + geom_point(alpha=.5) + scale_size_area()
# Tools subscription of within same size customers
tools <- full_data %>% group_by(Customer_size,Tools) %>%
summarise(count = n ()) %>% group_by(Customer_size) %>%
mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 24 x 5
## # Groups: Customer_size [3]
## Customer_size Tools count count_g perc
## <chr> <fct> <int> <int> <dbl>
## 1 Large A 178 7678 0.0232
## 2 Large AR 169 7678 0.0220
## 3 Large N 1538 7678 0.200
## 4 Large R 555 7678 0.0723
## 5 Large V 2880 7678 0.375
## 6 Large VA 469 7678 0.0611
## 7 Large VAR 444 7678 0.0578
## 8 Large VR 1445 7678 0.188
## 9 Mid A 677 32956 0.0205
## 10 Mid AR 433 32956 0.0131
## # … with 14 more rows
p <- ggplot(data = tools,aes(x=Customer_size,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill')
p
# Analyze Tools subscription of large size customers
p <- tools %>% filter(Customer_size == 'Large') %>% plot_ly(labels = ~Tools, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'Product Expenditures by Customer_size accross 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# Tools subscription of within same customer age group
tools <- full_data %>% group_by(age_group,Tools) %>%
summarise(count = n ()) %>% group_by(age_group) %>%
mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 24 x 5
## # Groups: age_group [3]
## age_group Tools count count_g perc
## <chr> <fct> <int> <int> <dbl>
## 1 Middle-aged customer (4-9) A 20558 677242 0.0304
## 2 Middle-aged customer (4-9) AR 2988 677242 0.00441
## 3 Middle-aged customer (4-9) N 405220 677242 0.598
## 4 Middle-aged customer (4-9) R 26005 677242 0.0384
## 5 Middle-aged customer (4-9) V 170601 677242 0.252
## 6 Middle-aged customer (4-9) VA 20418 677242 0.0301
## 7 Middle-aged customer (4-9) VAR 6157 677242 0.00909
## 8 Middle-aged customer (4-9) VR 25295 677242 0.0374
## 9 Old customer (9-15) A 3272 91289 0.0358
## 10 Old customer (9-15) AR 535 91289 0.00586
## # … with 14 more rows
p <- ggplot(data = tools,aes(x=age_group,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill')
p
# Tools subscription of within same Geo_code customer
tools <- full_data %>% group_by(Geo_Code,Tools) %>%
summarise(count = n ()) %>% group_by(Geo_Code) %>%
mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 49 x 5
## # Groups: Geo_Code [7]
## Geo_Code Tools count count_g perc
## <chr> <fct> <int> <int> <dbl>
## 1 AMER A 29686 824632 0.0360
## 2 AMER AR 3712 824632 0.00450
## 3 AMER N 503992 824632 0.611
## 4 AMER R 30148 824632 0.0366
## 5 AMER V 193970 824632 0.235
## 6 AMER VA 26738 824632 0.0324
## 7 AMER VAR 7258 824632 0.00880
## 8 AMER VR 29128 824632 0.0353
## 9 APAC A 7594 187635 0.0405
## 10 APAC AR 959 187635 0.00511
## # … with 39 more rows
p <- ggplot(data = tools,aes(x=Geo_Code,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill')
p
# Analyze Tools subscription of customers in Chinese Market
p <- tools %>% filter(Geo_Code == 'CHNA') %>% plot_ly(labels = ~Tools, values = ~perc) %>%
add_pie(hole = 0.6) %>%
layout(title = 'Tools subscription in China accross 3 months',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# save(full_data, file = 'full_data.RData')
# write.csv(full_data, 'full_data.csv')